Geographic Analysis

Introduction

This section examines the geographic distribution of job postings in the Lightcast dataset. We load the raw postings with Spark, inspect the schema, and prepare a trimmed pandas DataFrame whose columns can be referenced directly in the visualizations that follow.


import findspark
findspark.init()

from pyspark.sql import SparkSession
import pandas as pd
import plotly.express as px
import plotly.io as pio
import numpy as np

# Fix the RNG seed so any sampled results are reproducible across runs.
np.random.seed(42)

# Render Plotly figures inline in the notebook.
pio.renderers.default = "notebook"

# Start (or reuse) a local Spark session for this analysis.
spark = SparkSession.builder.appName("LightcastData").getOrCreate()

# Read the Lightcast postings CSV. multiLine/escape are needed because
# quoted fields contain embedded newlines and double quotes.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("multiLine", "true")
    .option("escape", "\"")
    .csv("./data/lightcast_job_postings.csv")
)

# Quick diagnostic look at the schema and the first few rows.
print("---This is Diagnostic check, No need to print it in the final doc---")

df.printSchema() # comment this line when rendering the submission
df.show(5)
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/25 18:31:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                
---This is Diagnostic check, No need to print it in the final doc---
root
 |-- ID: string (nullable = true)
 |-- LAST_UPDATED_DATE: string (nullable = true)
 |-- LAST_UPDATED_TIMESTAMP: timestamp (nullable = true)
 |-- DUPLICATES: integer (nullable = true)
 |-- POSTED: string (nullable = true)
 |-- EXPIRED: string (nullable = true)
 |-- DURATION: integer (nullable = true)
 |-- SOURCE_TYPES: string (nullable = true)
 |-- SOURCES: string (nullable = true)
 |-- URL: string (nullable = true)
 |-- ACTIVE_URLS: string (nullable = true)
 |-- ACTIVE_SOURCES_INFO: string (nullable = true)
 |-- TITLE_RAW: string (nullable = true)
 |-- BODY: string (nullable = true)
 |-- MODELED_EXPIRED: string (nullable = true)
 |-- MODELED_DURATION: integer (nullable = true)
 |-- COMPANY: integer (nullable = true)
 |-- COMPANY_NAME: string (nullable = true)
 |-- COMPANY_RAW: string (nullable = true)
 |-- COMPANY_IS_STAFFING: boolean (nullable = true)
 |-- EDUCATION_LEVELS: string (nullable = true)
 |-- EDUCATION_LEVELS_NAME: string (nullable = true)
 |-- MIN_EDULEVELS: integer (nullable = true)
 |-- MIN_EDULEVELS_NAME: string (nullable = true)
 |-- MAX_EDULEVELS: integer (nullable = true)
 |-- MAX_EDULEVELS_NAME: string (nullable = true)
 |-- EMPLOYMENT_TYPE: integer (nullable = true)
 |-- EMPLOYMENT_TYPE_NAME: string (nullable = true)
 |-- MIN_YEARS_EXPERIENCE: integer (nullable = true)
 |-- MAX_YEARS_EXPERIENCE: integer (nullable = true)
 |-- IS_INTERNSHIP: boolean (nullable = true)
 |-- SALARY: integer (nullable = true)
 |-- REMOTE_TYPE: integer (nullable = true)
 |-- REMOTE_TYPE_NAME: string (nullable = true)
 |-- ORIGINAL_PAY_PERIOD: string (nullable = true)
 |-- SALARY_TO: integer (nullable = true)
 |-- SALARY_FROM: integer (nullable = true)
 |-- LOCATION: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- CITY_NAME: string (nullable = true)
 |-- COUNTY: integer (nullable = true)
 |-- COUNTY_NAME: string (nullable = true)
 |-- MSA: integer (nullable = true)
 |-- MSA_NAME: string (nullable = true)
 |-- STATE: integer (nullable = true)
 |-- STATE_NAME: string (nullable = true)
 |-- COUNTY_OUTGOING: integer (nullable = true)
 |-- COUNTY_NAME_OUTGOING: string (nullable = true)
 |-- COUNTY_INCOMING: integer (nullable = true)
 |-- COUNTY_NAME_INCOMING: string (nullable = true)
 |-- MSA_OUTGOING: integer (nullable = true)
 |-- MSA_NAME_OUTGOING: string (nullable = true)
 |-- MSA_INCOMING: integer (nullable = true)
 |-- MSA_NAME_INCOMING: string (nullable = true)
 |-- NAICS2: integer (nullable = true)
 |-- NAICS2_NAME: string (nullable = true)
 |-- NAICS3: integer (nullable = true)
 |-- NAICS3_NAME: string (nullable = true)
 |-- NAICS4: integer (nullable = true)
 |-- NAICS4_NAME: string (nullable = true)
 |-- NAICS5: integer (nullable = true)
 |-- NAICS5_NAME: string (nullable = true)
 |-- NAICS6: integer (nullable = true)
 |-- NAICS6_NAME: string (nullable = true)
 |-- TITLE: string (nullable = true)
 |-- TITLE_NAME: string (nullable = true)
 |-- TITLE_CLEAN: string (nullable = true)
 |-- SKILLS: string (nullable = true)
 |-- SKILLS_NAME: string (nullable = true)
 |-- SPECIALIZED_SKILLS: string (nullable = true)
 |-- SPECIALIZED_SKILLS_NAME: string (nullable = true)
 |-- CERTIFICATIONS: string (nullable = true)
 |-- CERTIFICATIONS_NAME: string (nullable = true)
 |-- COMMON_SKILLS: string (nullable = true)
 |-- COMMON_SKILLS_NAME: string (nullable = true)
 |-- SOFTWARE_SKILLS: string (nullable = true)
 |-- SOFTWARE_SKILLS_NAME: string (nullable = true)
 |-- ONET: string (nullable = true)
 |-- ONET_NAME: string (nullable = true)
 |-- ONET_2019: string (nullable = true)
 |-- ONET_2019_NAME: string (nullable = true)
 |-- CIP6: string (nullable = true)
 |-- CIP6_NAME: string (nullable = true)
 |-- CIP4: string (nullable = true)
 |-- CIP4_NAME: string (nullable = true)
 |-- CIP2: string (nullable = true)
 |-- CIP2_NAME: string (nullable = true)
 |-- SOC_2021_2: string (nullable = true)
 |-- SOC_2021_2_NAME: string (nullable = true)
 |-- SOC_2021_3: string (nullable = true)
 |-- SOC_2021_3_NAME: string (nullable = true)
 |-- SOC_2021_4: string (nullable = true)
 |-- SOC_2021_4_NAME: string (nullable = true)
 |-- SOC_2021_5: string (nullable = true)
 |-- SOC_2021_5_NAME: string (nullable = true)
 |-- LOT_CAREER_AREA: integer (nullable = true)
 |-- LOT_CAREER_AREA_NAME: string (nullable = true)
 |-- LOT_OCCUPATION: integer (nullable = true)
 |-- LOT_OCCUPATION_NAME: string (nullable = true)
 |-- LOT_SPECIALIZED_OCCUPATION: integer (nullable = true)
 |-- LOT_SPECIALIZED_OCCUPATION_NAME: string (nullable = true)
 |-- LOT_OCCUPATION_GROUP: integer (nullable = true)
 |-- LOT_OCCUPATION_GROUP_NAME: string (nullable = true)
 |-- LOT_V6_SPECIALIZED_OCCUPATION: integer (nullable = true)
 |-- LOT_V6_SPECIALIZED_OCCUPATION_NAME: string (nullable = true)
 |-- LOT_V6_OCCUPATION: integer (nullable = true)
 |-- LOT_V6_OCCUPATION_NAME: string (nullable = true)
 |-- LOT_V6_OCCUPATION_GROUP: integer (nullable = true)
 |-- LOT_V6_OCCUPATION_GROUP_NAME: string (nullable = true)
 |-- LOT_V6_CAREER_AREA: integer (nullable = true)
 |-- LOT_V6_CAREER_AREA_NAME: string (nullable = true)
 |-- SOC_2: string (nullable = true)
 |-- SOC_2_NAME: string (nullable = true)
 |-- SOC_3: string (nullable = true)
 |-- SOC_3_NAME: string (nullable = true)
 |-- SOC_4: string (nullable = true)
 |-- SOC_4_NAME: string (nullable = true)
 |-- SOC_5: string (nullable = true)
 |-- SOC_5_NAME: string (nullable = true)
 |-- LIGHTCAST_SECTORS: string (nullable = true)
 |-- LIGHTCAST_SECTORS_NAME: string (nullable = true)
 |-- NAICS_2022_2: integer (nullable = true)
 |-- NAICS_2022_2_NAME: string (nullable = true)
 |-- NAICS_2022_3: integer (nullable = true)
 |-- NAICS_2022_3_NAME: string (nullable = true)
 |-- NAICS_2022_4: integer (nullable = true)
 |-- NAICS_2022_4_NAME: string (nullable = true)
 |-- NAICS_2022_5: integer (nullable = true)
 |-- NAICS_2022_5_NAME: string (nullable = true)
 |-- NAICS_2022_6: integer (nullable = true)
 |-- NAICS_2022_6_NAME: string (nullable = true)
25/06/25 18:31:57 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                
+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+----------+---------------+----------+---------------+---------------+--------------------+--------------+--------------------+--------------------------+-------------------------------+--------------------+-------------------------+-----------------------------+----------------------------------+-----------------+----------------------+-----------------------+----------------------------+------------------+-----------------------+-------+---------------
-----+-------+--------------------+-------+---------------+-------+---------------+-----------------+----------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+
|                  ID|LAST_UPDATED_DATE|LAST_UPDATED_TIMESTAMP|DUPLICATES|  POSTED|  EXPIRED|DURATION|        SOURCE_TYPES|             SOURCES|                 URL|ACTIVE_URLS|ACTIVE_SOURCES_INFO|           TITLE_RAW|                BODY|MODELED_EXPIRED|MODELED_DURATION| COMPANY|        COMPANY_NAME|COMPANY_RAW|COMPANY_IS_STAFFING|EDUCATION_LEVELS|EDUCATION_LEVELS_NAME|MIN_EDULEVELS| MIN_EDULEVELS_NAME|MAX_EDULEVELS|MAX_EDULEVELS_NAME|EMPLOYMENT_TYPE|EMPLOYMENT_TYPE_NAME|MIN_YEARS_EXPERIENCE|MAX_YEARS_EXPERIENCE|IS_INTERNSHIP|SALARY|REMOTE_TYPE|REMOTE_TYPE_NAME|ORIGINAL_PAY_PERIOD|SALARY_TO|SALARY_FROM|            LOCATION|                CITY|    CITY_NAME|COUNTY|   COUNTY_NAME|  MSA|            MSA_NAME|STATE|STATE_NAME|COUNTY_OUTGOING|COUNTY_NAME_OUTGOING|COUNTY_INCOMING|COUNTY_NAME_INCOMING|MSA_OUTGOING|   MSA_NAME_OUTGOING|MSA_INCOMING|   MSA_NAME_INCOMING|NAICS2|         NAICS2_NAME|NAICS3|         NAICS3_NAME|NAICS4|         NAICS4_NAME|NAICS5|         NAICS5_NAME|NAICS6|         NAICS6_NAME|             TITLE|         TITLE_NAME|         TITLE_CLEAN|              SKILLS|         SKILLS_NAME|  SPECIALIZED_SKILLS|SPECIALIZED_SKILLS_NAME|      CERTIFICATIONS| CERTIFICATIONS_NAME|       COMMON_SKILLS|  COMMON_SKILLS_NAME|     SOFTWARE_SKILLS|SOFTWARE_SKILLS_NAME|      ONET|           ONET_NAME| ONET_2019|      ONET_2019_NAME|                CIP6|           CIP6_NAME|                CIP4|           CIP4_NAME|                CIP2|           CIP2_NAME|SOC_2021_2|     SOC_2021_2_NAME|SOC_2021_3|     SOC_2021_3_NAME|SOC_2021_4|SOC_2021_4_NAME|SOC_2021_5|SOC_2021_5_NAME|LOT_CAREER_AREA|LOT_CAREER_AREA_NAME|LOT_OCCUPATION| LOT_OCCUPATION_NAME|LOT_SPECIALIZED_OCCUPATION|LOT_SPECIALIZED_OCCUPATION_NAME|LOT_OCCUPATION_GROUP|LOT_OCCUPATION_GROUP_NAME|LOT_V6_SPECIALIZED_OCCUPATION|LOT_V6_SPECIALIZED_OCCUPATION_NAME|LOT_V6_OCCUPATION|LOT_V6_OCCUPATION_NAME|LOT_V6_OCCUPATION_GROUP|LOT_V6_OCCUPATION_GROUP_NAME|LOT_V6_CAREER_AREA|LOT_V6_CAREER_AREA_NAME|  SOC_2|          
SOC_2_NAME|  SOC_3|          SOC_3_NAME|  SOC_4|     SOC_4_NAME|  SOC_5|     SOC_5_NAME|LIGHTCAST_SECTORS|LIGHTCAST_SECTORS_NAME|NAICS_2022_2|   NAICS_2022_2_NAME|NAICS_2022_3|   NAICS_2022_3_NAME|NAICS_2022_4|   NAICS_2022_4_NAME|NAICS_2022_5|   NAICS_2022_5_NAME|NAICS_2022_6|   NAICS_2022_6_NAME|
+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+----------+---------------+----------+---------------+---------------+--------------------+--------------+--------------------+--------------------------+-------------------------------+--------------------+-------------------------+-----------------------------+----------------------------------+-----------------+----------------------+-----------------------+----------------------------+------------------+-----------------------+-------+---------------
-----+-------+--------------------+-------+---------------+-------+---------------+-----------------+----------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+
|1f57d95acf4dc67ed...|         9/6/2024|  2024-09-06 20:32:...|         0|6/2/2024| 6/8/2024|       6|   [\n  "Company"\n]|[\n  "brassring.c...|[\n  "https://sjo...|         []|               NULL|Enterprise Analys...|31-May-2024\n\nEn...|       6/8/2024|               6|  894731|          Murphy USA| Murphy USA|              false|       [\n  2\n]| [\n  "Bachelor's ...|            2|  Bachelor's degree|         NULL|              NULL|              1|Full-time (> 32 h...|                   2|                   2|        false|  NULL|          0|          [None]|               NULL|     NULL|       NULL|{\n  "lat": 33.20...|RWwgRG9yYWRvLCBBUg==|El Dorado, AR|  5139|     Union, AR|20980|       El Dorado, AR|    5|  Arkansas|           5139|           Union, AR|           5139|           Union, AR|       20980|       El Dorado, AR|       20980|       El Dorado, AR|    44|        Retail Trade|   441|Motor Vehicle and...|  4413|Automotive Parts,...| 44133|Automotive Parts ...|441330|Automotive Parts ...|ET29C073C03D1F86B4|Enterprise Analysts|enterprise analys...|[\n  "KS126DB6T06...|[\n  "Merchandisi...|[\n  "KS126DB6T06...|   [\n  "Merchandisi...|                  []|                  []|[\n  "KS126706DPF...|[\n  "Mathematics...|[\n  "KS440W865GC...|[\n  "SQL (Progra...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...|[\n  "45.0601",\n...|[\n  "Economics, ...|[\n  "45.06",\n  ...|[\n  "Economics",...|[\n  "45",\n  "27...|[\n  "Social Scie...|   15-0000|Computer and Math...|   15-2000|Mathematical Scie...|   15-2050|Data Scientists|   15-2051|Data Scientists|             23|Information Techn...|        231010|Business Intellig...|                  23101011|           General ERP Analy...|                2310|     Business Intellig...|                     23101011|              General ERP Analy...|           231010|  Business Intellig...|                   2310|        Business Intellig...|                23|   Information Techn...|15-0000|Computer and 
Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists|        [\n  7\n]|  [\n  "Artificial ...|          44|        Retail Trade|         441|Motor Vehicle and...|        4413|Automotive Parts,...|       44133|Automotive Parts ...|      441330|Automotive Parts ...|
|0cb072af26757b6c4...|         8/2/2024|  2024-08-02 17:08:...|         0|6/2/2024| 8/1/2024|    NULL| [\n  "Job Board"\n]| [\n  "maine.gov"\n]|[\n  "https://job...|         []|               NULL|Oracle Consultant...|Oracle Consultant...|       8/1/2024|            NULL|  133098|Smx Corporation L...|        SMX|               true|      [\n  99\n]| [\n  "No Educatio...|           99|No Education Listed|         NULL|              NULL|              1|Full-time (> 32 h...|                   3|                   3|        false|  NULL|          1|          Remote|               NULL|     NULL|       NULL|{\n  "lat": 44.31...|    QXVndXN0YSwgTUU=|  Augusta, ME| 23011|  Kennebec, ME|12300|Augusta-Watervill...|   23|     Maine|          23011|        Kennebec, ME|          23011|        Kennebec, ME|       12300|Augusta-Watervill...|       12300|Augusta-Watervill...|    56|Administrative an...|   561|Administrative an...|  5613| Employment Services| 56132|Temporary Help Se...|561320|Temporary Help Se...|ET21DDA63780A7DC09| Oracle Consultants|oracle consultant...|[\n  "KS122626T55...|[\n  "Procurement...|[\n  "KS122626T55...|   [\n  "Procurement...|                  []|                  []|                  []|                  []|[\n  "BGSBF3F508F...|[\n  "Oracle Busi...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...|                  []|                  []|                  []|                  []|                  []|                  []|   15-0000|Computer and Math...|   15-2000|Mathematical Scie...|   15-2050|Data Scientists|   15-2051|Data Scientists|             23|Information Techn...|        231010|Business Intellig...|                  23101012|           Oracle Consultant...|                2310|     Business Intellig...|                     23101012|              Oracle Consultant...|           231010|  Business Intellig...|                   2310|        Business Intellig...|                23|   Information Techn...|15-0000|Computer and 
Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists|             NULL|                  NULL|          56|Administrative an...|         561|Administrative an...|        5613| Employment Services|       56132|Temporary Help Se...|      561320|Temporary Help Se...|
|85318b12b3331fa49...|         9/6/2024|  2024-09-06 20:32:...|         1|6/2/2024| 7/7/2024|      35| [\n  "Job Board"\n]|[\n  "dejobs.org"\n]|[\n  "https://dej...|         []|               NULL|        Data Analyst|Taking care of pe...|      6/10/2024|               8|39063746|            Sedgwick|   Sedgwick|              false|       [\n  2\n]| [\n  "Bachelor's ...|            2|  Bachelor's degree|         NULL|              NULL|              1|Full-time (> 32 h...|                   5|                NULL|        false|  NULL|          0|          [None]|               NULL|     NULL|       NULL|{\n  "lat": 32.77...|    RGFsbGFzLCBUWA==|   Dallas, TX| 48113|    Dallas, TX|19100|Dallas-Fort Worth...|   48|     Texas|          48113|          Dallas, TX|          48113|          Dallas, TX|       19100|Dallas-Fort Worth...|       19100|Dallas-Fort Worth...|    52|Finance and Insur...|   524|Insurance Carrier...|  5242|Agencies, Brokera...| 52429|Other Insurance R...|524291|    Claims Adjusting|ET3037E0C947A02404|      Data Analysts|        data analyst|[\n  "KS1218W78FG...|[\n  "Management"...|[\n  "ESF3939CE1F...|   [\n  "Exception R...|[\n  "KS683TN76T7...|[\n  "Security Cl...|[\n  "KS1218W78FG...|[\n  "Management"...|[\n  "KS126HY6YLT...|[\n  "Microsoft O...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...|                  []|                  []|                  []|                  []|                  []|                  []|   15-0000|Computer and Math...|   15-2000|Mathematical Scie...|   15-2050|Data Scientists|   15-2051|Data Scientists|             23|Information Techn...|        231113|Data / Data Minin...|                  23111310|                   Data Analyst|                2311|     Data Analysis and...|                     23111310|                      Data Analyst|           231113|  Data / Data Minin...|                   2311|        Data Analysis and...|                23|   Information Techn...|15-0000|Computer and 
Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists|             NULL|                  NULL|          52|Finance and Insur...|         524|Insurance Carrier...|        5242|Agencies, Brokera...|       52429|Other Insurance R...|      524291|    Claims Adjusting|
|1b5c3941e54a1889e...|         9/6/2024|  2024-09-06 20:32:...|         1|6/2/2024|7/20/2024|      48| [\n  "Job Board"\n]|[\n  "disabledper...|[\n  "https://www...|         []|               NULL|Sr. Lead Data Mgm...|About this role:\...|      6/12/2024|              10|37615159|         Wells Fargo|Wells Fargo|              false|      [\n  99\n]| [\n  "No Educatio...|           99|No Education Listed|         NULL|              NULL|              1|Full-time (> 32 h...|                   3|                NULL|        false|  NULL|          0|          [None]|               NULL|     NULL|       NULL|{\n  "lat": 33.44...|    UGhvZW5peCwgQVo=|  Phoenix, AZ|  4013|  Maricopa, AZ|38060|Phoenix-Mesa-Chan...|    4|   Arizona|           4013|        Maricopa, AZ|           4013|        Maricopa, AZ|       38060|Phoenix-Mesa-Chan...|       38060|Phoenix-Mesa-Chan...|    52|Finance and Insur...|   522|Credit Intermedia...|  5221|Depository Credit...| 52211|  Commercial Banking|522110|  Commercial Banking|ET2114E0404BA30075|Management Analysts|sr lead data mgmt...|[\n  "KS123QX62QY...|[\n  "Exit Strate...|[\n  "KS123QX62QY...|   [\n  "Exit Strate...|                  []|                  []|[\n  "KS7G6NP6R6L...|[\n  "Reliability...|[\n  "KS4409D76NW...|[\n  "SAS (Softwa...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...|                  []|                  []|                  []|                  []|                  []|                  []|   15-0000|Computer and Math...|   15-2000|Mathematical Scie...|   15-2050|Data Scientists|   15-2051|Data Scientists|             23|Information Techn...|        231113|Data / Data Minin...|                  23111310|                   Data Analyst|                2311|     Data Analysis and...|                     23111310|                      Data Analyst|           231113|  Data / Data Minin...|                   2311|        Data Analysis and...|                23|   Information Techn...|15-0000|Computer and 
Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists|        [\n  6\n]|  [\n  "Data Privac...|          52|Finance and Insur...|         522|Credit Intermedia...|        5221|Depository Credit...|       52211|  Commercial Banking|      522110|  Commercial Banking|
|cb5ca25f02bdf25c1...|        6/19/2024|   2024-06-19 07:00:00|         0|6/2/2024|6/17/2024|      15|[\n  "FreeJobBoar...|[\n  "craigslist....|[\n  "https://mod...|         []|               NULL|Comisiones de $10...|Comisiones de $10...|      6/17/2024|              15|       0|        Unclassified|      LH/GM|              false|      [\n  99\n]| [\n  "No Educatio...|           99|No Education Listed|         NULL|              NULL|              3|Part-time / full-...|                NULL|                NULL|        false| 92500|          0|          [None]|               year|   150000|      35000|{\n  "lat": 37.63...|    TW9kZXN0bywgQ0E=|  Modesto, CA|  6099|Stanislaus, CA|33700|         Modesto, CA|    6|California|           6099|      Stanislaus, CA|           6099|      Stanislaus, CA|       33700|         Modesto, CA|       33700|         Modesto, CA|    99|Unclassified Indu...|   999|Unclassified Indu...|  9999|Unclassified Indu...| 99999|Unclassified Indu...|999999|Unclassified Indu...|ET0000000000000000|       Unclassified|comisiones de por...|                  []|                  []|                  []|                     []|                  []|                  []|                  []|                  []|                  []|                  []|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...|                  []|                  []|                  []|                  []|                  []|                  []|   15-0000|Computer and Math...|   15-2000|Mathematical Scie...|   15-2050|Data Scientists|   15-2051|Data Scientists|             23|Information Techn...|        231010|Business Intellig...|                  23101012|           Oracle Consultant...|                2310|     Business Intellig...|                     23101012|              Oracle Consultant...|           231010|  Business Intellig...|                   2310|        Business Intellig...|                23|   Information Techn...|15-0000|Computer and 
Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists|             NULL|                  NULL|          99|Unclassified Indu...|         999|Unclassified Indu...|        9999|Unclassified Indu...|       99999|Unclassified Indu...|      999999|Unclassified Indu...|
+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+----------+---------------+----------+---------------+---------------+--------------------+--------------+--------------------+--------------------------+-------------------------------+--------------------+-------------------------+-----------------------------+----------------------------------+-----------------+----------------------+-----------------------+----------------------------+------------------+-----------------------+-------+---------------
-----+-------+--------------------+-------+---------------+-------+---------------+-----------------+----------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+
only showing top 5 rows
## Listing Columns So We Can Reference Them in Visuals

import pandas as pd

# List the column names so visuals below can reference them by name.
# nrows=0 reads only the header row: the full file is not loaded, which is
# both faster and avoids pandas' mixed-dtype (DtypeWarning) inference issue.
df = pd.read_csv("./data/lightcast_job_postings.csv", nrows=0)
print(df.columns.tolist())
/tmp/ipykernel_2724/3265774581.py:4: DtypeWarning:

Columns (19,30) have mixed types. Specify dtype option on import or set low_memory=False.
['ID', 'LAST_UPDATED_DATE', 'LAST_UPDATED_TIMESTAMP', 'DUPLICATES', 'POSTED', 'EXPIRED', 'DURATION', 'SOURCE_TYPES', 'SOURCES', 'URL', 'ACTIVE_URLS', 'ACTIVE_SOURCES_INFO', 'TITLE_RAW', 'BODY', 'MODELED_EXPIRED', 'MODELED_DURATION', 'COMPANY', 'COMPANY_NAME', 'COMPANY_RAW', 'COMPANY_IS_STAFFING', 'EDUCATION_LEVELS', 'EDUCATION_LEVELS_NAME', 'MIN_EDULEVELS', 'MIN_EDULEVELS_NAME', 'MAX_EDULEVELS', 'MAX_EDULEVELS_NAME', 'EMPLOYMENT_TYPE', 'EMPLOYMENT_TYPE_NAME', 'MIN_YEARS_EXPERIENCE', 'MAX_YEARS_EXPERIENCE', 'IS_INTERNSHIP', 'SALARY', 'REMOTE_TYPE', 'REMOTE_TYPE_NAME', 'ORIGINAL_PAY_PERIOD', 'SALARY_TO', 'SALARY_FROM', 'LOCATION', 'CITY', 'CITY_NAME', 'COUNTY', 'COUNTY_NAME', 'MSA', 'MSA_NAME', 'STATE', 'STATE_NAME', 'COUNTY_OUTGOING', 'COUNTY_NAME_OUTGOING', 'COUNTY_INCOMING', 'COUNTY_NAME_INCOMING', 'MSA_OUTGOING', 'MSA_NAME_OUTGOING', 'MSA_INCOMING', 'MSA_NAME_INCOMING', 'NAICS2', 'NAICS2_NAME', 'NAICS3', 'NAICS3_NAME', 'NAICS4', 'NAICS4_NAME', 'NAICS5', 'NAICS5_NAME', 'NAICS6', 'NAICS6_NAME', 'TITLE', 'TITLE_NAME', 'TITLE_CLEAN', 'SKILLS', 'SKILLS_NAME', 'SPECIALIZED_SKILLS', 'SPECIALIZED_SKILLS_NAME', 'CERTIFICATIONS', 'CERTIFICATIONS_NAME', 'COMMON_SKILLS', 'COMMON_SKILLS_NAME', 'SOFTWARE_SKILLS', 'SOFTWARE_SKILLS_NAME', 'ONET', 'ONET_NAME', 'ONET_2019', 'ONET_2019_NAME', 'CIP6', 'CIP6_NAME', 'CIP4', 'CIP4_NAME', 'CIP2', 'CIP2_NAME', 'SOC_2021_2', 'SOC_2021_2_NAME', 'SOC_2021_3', 'SOC_2021_3_NAME', 'SOC_2021_4', 'SOC_2021_4_NAME', 'SOC_2021_5', 'SOC_2021_5_NAME', 'LOT_CAREER_AREA', 'LOT_CAREER_AREA_NAME', 'LOT_OCCUPATION', 'LOT_OCCUPATION_NAME', 'LOT_SPECIALIZED_OCCUPATION', 'LOT_SPECIALIZED_OCCUPATION_NAME', 'LOT_OCCUPATION_GROUP', 'LOT_OCCUPATION_GROUP_NAME', 'LOT_V6_SPECIALIZED_OCCUPATION', 'LOT_V6_SPECIALIZED_OCCUPATION_NAME', 'LOT_V6_OCCUPATION', 'LOT_V6_OCCUPATION_NAME', 'LOT_V6_OCCUPATION_GROUP', 'LOT_V6_OCCUPATION_GROUP_NAME', 'LOT_V6_CAREER_AREA', 'LOT_V6_CAREER_AREA_NAME', 'SOC_2', 'SOC_2_NAME', 'SOC_3', 'SOC_3_NAME', 'SOC_4', 'SOC_4_NAME', 'SOC_5', 
'SOC_5_NAME', 'LIGHTCAST_SECTORS', 'LIGHTCAST_SECTORS_NAME', 'NAICS_2022_2', 'NAICS_2022_2_NAME', 'NAICS_2022_3', 'NAICS_2022_3_NAME', 'NAICS_2022_4', 'NAICS_2022_4_NAME', 'NAICS_2022_5', 'NAICS_2022_5_NAME', 'NAICS_2022_6', 'NAICS_2022_6_NAME']
import pandas as pd

# Load the full postings extract. low_memory=False reads each column in a
# single pass, avoiding the chunked dtype inference that triggered the
# "Columns (19,30) have mixed types" DtypeWarning on this file.
df = pd.read_csv("./data/lightcast_job_postings.csv", low_memory=False)

# Identifier / duplicate-coded columns with no analytic value here.
columns_to_drop = [
    "ID", "URL", "ACTIVE_URLS", "DUPLICATES", "LAST_UPDATED_TIMESTAMP",
    "NAICS2", "NAICS3", "NAICS4", "NAICS5", "NAICS6",
    "SOC_2", "SOC_3", "SOC_5"
]

# errors="ignore" keeps the cell re-runnable: a second execution after the
# columns are already gone no longer raises KeyError.
df.drop(columns=columns_to_drop, inplace=True, errors="ignore")
/tmp/ipykernel_2724/304705447.py:3: DtypeWarning:

Columns (19,30) have mixed types. Specify dtype option on import or set low_memory=False.
# Show the column names that remain after the drop.
print(list(df.columns))
['LAST_UPDATED_DATE', 'POSTED', 'EXPIRED', 'DURATION', 'SOURCE_TYPES', 'SOURCES', 'ACTIVE_SOURCES_INFO', 'TITLE_RAW', 'BODY', 'MODELED_EXPIRED', 'MODELED_DURATION', 'COMPANY', 'COMPANY_NAME', 'COMPANY_RAW', 'COMPANY_IS_STAFFING', 'EDUCATION_LEVELS', 'EDUCATION_LEVELS_NAME', 'MIN_EDULEVELS', 'MIN_EDULEVELS_NAME', 'MAX_EDULEVELS', 'MAX_EDULEVELS_NAME', 'EMPLOYMENT_TYPE', 'EMPLOYMENT_TYPE_NAME', 'MIN_YEARS_EXPERIENCE', 'MAX_YEARS_EXPERIENCE', 'IS_INTERNSHIP', 'SALARY', 'REMOTE_TYPE', 'REMOTE_TYPE_NAME', 'ORIGINAL_PAY_PERIOD', 'SALARY_TO', 'SALARY_FROM', 'LOCATION', 'CITY', 'CITY_NAME', 'COUNTY', 'COUNTY_NAME', 'MSA', 'MSA_NAME', 'STATE', 'STATE_NAME', 'COUNTY_OUTGOING', 'COUNTY_NAME_OUTGOING', 'COUNTY_INCOMING', 'COUNTY_NAME_INCOMING', 'MSA_OUTGOING', 'MSA_NAME_OUTGOING', 'MSA_INCOMING', 'MSA_NAME_INCOMING', 'NAICS2_NAME', 'NAICS3_NAME', 'NAICS4_NAME', 'NAICS5_NAME', 'NAICS6_NAME', 'TITLE', 'TITLE_NAME', 'TITLE_CLEAN', 'SKILLS', 'SKILLS_NAME', 'SPECIALIZED_SKILLS', 'SPECIALIZED_SKILLS_NAME', 'CERTIFICATIONS', 'CERTIFICATIONS_NAME', 'COMMON_SKILLS', 'COMMON_SKILLS_NAME', 'SOFTWARE_SKILLS', 'SOFTWARE_SKILLS_NAME', 'ONET', 'ONET_NAME', 'ONET_2019', 'ONET_2019_NAME', 'CIP6', 'CIP6_NAME', 'CIP4', 'CIP4_NAME', 'CIP2', 'CIP2_NAME', 'SOC_2021_2', 'SOC_2021_2_NAME', 'SOC_2021_3', 'SOC_2021_3_NAME', 'SOC_2021_4', 'SOC_2021_4_NAME', 'SOC_2021_5', 'SOC_2021_5_NAME', 'LOT_CAREER_AREA', 'LOT_CAREER_AREA_NAME', 'LOT_OCCUPATION', 'LOT_OCCUPATION_NAME', 'LOT_SPECIALIZED_OCCUPATION', 'LOT_SPECIALIZED_OCCUPATION_NAME', 'LOT_OCCUPATION_GROUP', 'LOT_OCCUPATION_GROUP_NAME', 'LOT_V6_SPECIALIZED_OCCUPATION', 'LOT_V6_SPECIALIZED_OCCUPATION_NAME', 'LOT_V6_OCCUPATION', 'LOT_V6_OCCUPATION_NAME', 'LOT_V6_OCCUPATION_GROUP', 'LOT_V6_OCCUPATION_GROUP_NAME', 'LOT_V6_CAREER_AREA', 'LOT_V6_CAREER_AREA_NAME', 'SOC_2_NAME', 'SOC_3_NAME', 'SOC_4', 'SOC_4_NAME', 'SOC_5_NAME', 'LIGHTCAST_SECTORS', 'LIGHTCAST_SECTORS_NAME', 'NAICS_2022_2', 'NAICS_2022_2_NAME', 'NAICS_2022_3', 'NAICS_2022_3_NAME', 
'NAICS_2022_4', 'NAICS_2022_4_NAME', 'NAICS_2022_5', 'NAICS_2022_5_NAME', 'NAICS_2022_6', 'NAICS_2022_6_NAME']
!pip install missingno
Requirement already satisfied: missingno in ./.venv/lib/python3.12/site-packages (0.5.2)
Requirement already satisfied: numpy in ./.venv/lib/python3.12/site-packages (from missingno) (2.2.6)
Requirement already satisfied: matplotlib in ./.venv/lib/python3.12/site-packages (from missingno) (3.10.3)
Requirement already satisfied: scipy in ./.venv/lib/python3.12/site-packages (from missingno) (1.15.3)
Requirement already satisfied: seaborn in ./.venv/lib/python3.12/site-packages (from missingno) (0.13.2)
Requirement already satisfied: contourpy>=1.0.1 in ./.venv/lib/python3.12/site-packages (from matplotlib->missingno) (1.3.2)
Requirement already satisfied: cycler>=0.10 in ./.venv/lib/python3.12/site-packages (from matplotlib->missingno) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in ./.venv/lib/python3.12/site-packages (from matplotlib->missingno) (4.58.0)
Requirement already satisfied: kiwisolver>=1.3.1 in ./.venv/lib/python3.12/site-packages (from matplotlib->missingno) (1.4.8)
Requirement already satisfied: packaging>=20.0 in ./.venv/lib/python3.12/site-packages (from matplotlib->missingno) (25.0)
Requirement already satisfied: pillow>=8 in ./.venv/lib/python3.12/site-packages (from matplotlib->missingno) (11.2.1)
Requirement already satisfied: pyparsing>=2.3.1 in ./.venv/lib/python3.12/site-packages (from matplotlib->missingno) (3.2.3)
Requirement already satisfied: python-dateutil>=2.7 in ./.venv/lib/python3.12/site-packages (from matplotlib->missingno) (2.9.0.post0)
Requirement already satisfied: pandas>=1.2 in ./.venv/lib/python3.12/site-packages (from seaborn->missingno) (2.2.3)
Requirement already satisfied: pytz>=2020.1 in ./.venv/lib/python3.12/site-packages (from pandas>=1.2->seaborn->missingno) (2025.2)
Requirement already satisfied: tzdata>=2022.7 in ./.venv/lib/python3.12/site-packages (from pandas>=1.2->seaborn->missingno) (2025.2)
Requirement already satisfied: six>=1.5 in ./.venv/lib/python3.12/site-packages (from python-dateutil>=2.7->matplotlib->missingno) (1.17.0)
import missingno as msno
import matplotlib.pyplot as plt

# Visualize which columns tend to be missing together.
msno.heatmap(df)
plt.title("Missing Values Heatmap")
plt.show()

# Drop columns with >50% missing values
df.dropna(thresh=len(df) * 0.5, axis=1, inplace=True)

# Fill only the columns that actually exist — each fill guarded by its OWN
# column check (the original guarded the Salary fill behind the Industry
# check, so Salary was silently skipped whenever Industry was absent).
# Direct assignment replaces the deprecated chained fillna(inplace=True).
if "Industry" in df.columns:
    df["Industry"] = df["Industry"].fillna("Unknown")
if "Salary" in df.columns:
    df["Salary"] = df["Salary"].fillna(df["Salary"].median())

# One row per unique posting.
df = df.drop_duplicates(subset=["TITLE", "COMPANY", "LOCATION", "POSTED"], keep="first")
# Remove unclassifiable industries; .copy() detaches the filtered frame so
# the assignment below does not hit SettingWithCopyWarning on a view.
df = df[df['NAICS_2022_2_NAME'] != 'Unclassified Industry'].copy()
# Normalise the literal "[None]" remote-type marker.
df['REMOTE_TYPE_NAME'] = df['REMOTE_TYPE_NAME'].replace('[None]', 'Unknown')

import pandas as pd
import plotly.express as px

# Sample data setup (replace this with your actual dataframe)
data = {
    'Industry': [
        'Wholesale Trade', 'Retail Trade', 'Real Estate and Rental and Leasing',
        'Professional, Scientific, and Technical Services', 'Manufacturing',
        'Information', 'Health Care and Social Assistance',
        'Finance and Insurance', 'Educational Services',
        'Administrative and Support and Waste Management and Remediation Services'
    ],
    'Flexible Remote': [87.8, 94.4, 97.6, 92.2, 89.7, 95.8, 92.1, 94.8, 89.0, 94.8],
    'Onsite': [12.2, 5.6, 2.4, 7.8, 10.3, 4.2, 7.9, 5.2, 11.0, 5.2]
}
df = pd.DataFrame(data)

# Reshape wide -> long so each (industry, remote-type) pair is one row.
df_melted = df.melt(
    id_vars='Industry',
    value_vars=['Flexible Remote', 'Onsite'],
    var_name='Remote Type',
    value_name='Percentage',
)

# Order industries by their Flexible Remote share, highest first.
df['SortKey'] = df['Flexible Remote']
df = df.sort_values(by='SortKey', ascending=False)
df_melted['Industry'] = pd.Categorical(
    df_melted['Industry'], categories=df['Industry'], ordered=True
)

# Stacked horizontal bars: one bar per industry, split Remote vs Onsite.
fig = px.bar(
    df_melted,
    x="Percentage",
    y="Industry",
    color="Remote Type",
    orientation="h",
    text="Percentage",
    color_discrete_map={"Flexible Remote": "#636EFA", "Onsite": "#EF553B"},
    title="Remote Job Distribution by Industry (Top 10 Industries)",
)

# Tidy layout: trimmed margins, stacked bars, whole-number ticks.
fig.update_layout(
    xaxis_title="Percentage of Jobs",
    yaxis_title="",
    xaxis=dict(tickformat=".0f"),
    legend_title="Remote Type",
    barmode='stack',
    margin=dict(l=10, r=10, t=60, b=40),
    height=500,
)

# One-decimal percentage labels inside each segment.
fig.update_traces(texttemplate='%{text:.1f}%', textposition='inside')

fig.show()






import pandas as pd
from IPython.display import display

# 1. Reload the raw extract with POSTED/EXPIRED parsed as datetimes.
# low_memory=False reads each column in one pass, fixing the
# "Columns (19,30) have mixed types" DtypeWarning this cell emitted.
df = pd.read_csv(
    "./data/lightcast_job_postings.csv",
    parse_dates=["POSTED", "EXPIRED"],
    low_memory=False,
)
# Posting lifetime in whole days.
df['DURATION'] = (df['EXPIRED'] - df['POSTED']).dt.days
# Calendar month the job was posted (Period, e.g. 2024-06).
df['POSTED_MONTH'] = df['POSTED'].dt.to_period("M")
/tmp/ipykernel_2724/784347523.py:4: DtypeWarning:

Columns (19,30) have mixed types. Specify dtype option on import or set low_memory=False.
# 2-digit NAICS sectors treated as AI-impacted vs not (project assumption).
ai_impacted_naics = [31, 32, 33, 42, 44, 45, 51, 52, 54, 55, 56]
non_ai_impacted_naics = [11, 21, 22, 23, 48, 49, 61, 62, 71, 72]

# Create IS_AI_JOB flag: 1 when the sector is in the AI-impacted list.
# Vectorised isin() replaces the original row-wise apply/lambda; NaN is
# never "in" the list, so missing codes still map to 0.
df['IS_AI_JOB'] = df['NAICS_2022_2'].isin(ai_impacted_naics).astype(int)

# Remove rows with missing STATE_NAME or POSTED_MONTH
df = df[df['STATE_NAME'].notna() & df['POSTED_MONTH'].notna()]

# Count postings per (month, state, AI flag).
job_counts = (
    df.groupby(['POSTED_MONTH', 'STATE_NAME', 'IS_AI_JOB'])
      .size()
      .reset_index(name='JOB_COUNT')
)
# Pivot: one row per (state, AI flag), one column per month.
pivot_df = job_counts.pivot_table(index=['STATE_NAME', 'IS_AI_JOB'],
                                  columns='POSTED_MONTH',
                                  values='JOB_COUNT',
                                  fill_value=0)

# Sanity check: June-2024 counts per (state, AI flag), with a plain
# positional index for readability.
pivot_df['2024-06'].reset_index(drop=True)
0       38.0
1      110.0
2       19.0
3       42.0
4       87.0
       ...  
97      17.0
98      53.0
99     151.0
100      1.0
101     18.0
Name: 2024-06, Length: 102, dtype: float64
# Month-over-month growth for June. (The original assigned the undefined
# name `m_56` here, raising NameError; this computes what the surrounding
# commented lines show was intended.)
pivot_df["PCT_6"] = (pivot_df['2024-06'] - pivot_df['2024-05']) * 100 / pivot_df['2024-05']
# Overall % growth from the first (May) to the last (September) month.
pivot_df["PCT_CHANGE"] = (pivot_df['2024-09'] - pivot_df['2024-05']) * 100 / pivot_df['2024-05']

pivot_df.head()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[15], line 2
      1 # Calculate % growth from first to last month
----> 2 pivot_df["PCT_6"]=m_56
      3 # pivot_df["PCT_7"]=(pivot_df['2024-07']-pivot_df['2024-06'])*100/pivot_df['2024-06']
      4 # pivot_df["PCT_8"]=(pivot_df['2024-08']-pivot_df['2024-07'])*100/pivot_df['2024-07']
      5 # pivot_df["PCT_9"]=(pivot_df['2024-09']-pivot_df['2024-08'])*100/pivot_df['2024-08']
      6 pivot_df["PCT_CHANGE"]=(pivot_df['2024-09']-pivot_df['2024-05'])*100/pivot_df['2024-05']

NameError: name 'm_56' is not defined

# Remove rows where growth is undefined (zero postings in the base month).
pivot_df = pivot_df.dropna(subset=['PCT_CHANGE'])

# Back to ordinary columns so we can filter and sort by label.
pivot_df = pivot_df.reset_index()

# Ten fastest-growing states within each job class.
ai_mask = pivot_df['IS_AI_JOB'] == 1
top_ai = pivot_df.loc[ai_mask].sort_values('PCT_CHANGE', ascending=False).head(10)
top_non_ai = pivot_df.loc[~ai_mask].sort_values('PCT_CHANGE', ascending=False).head(10)

# Stack the two lists with a readable label for each.
top_combined = pd.concat([
    top_ai.assign(JOB_TYPE='AI'),
    top_non_ai.assign(JOB_TYPE='Non-AI'),
])

# Display 
display(top_combined[['STATE_NAME', 'IS_AI_JOB', 'PCT_CHANGE', 'JOB_TYPE']])
POSTED_MONTH STATE_NAME IS_AI_JOB PCT_CHANGE JOB_TYPE
101 Wyoming 1 220.000000 AI
97 West Virginia 1 200.000000 AI
81 South Dakota 1 66.666667 AI
61 New Mexico 1 12.000000 AI
21 Hawaii 1 0.000000 AI
67 North Dakota 1 0.000000 AI
73 Oregon 1 -0.943396 AI
89 Vermont 1 -3.333333 AI
53 Nebraska 1 -8.196721 AI
47 Mississippi 1 -8.510638 AI
88 Vermont 0 400.000000 Non-AI
50 Montana 0 166.666667 Non-AI
66 North Dakota 0 166.666667 Non-AI
96 West Virginia 0 100.000000 Non-AI
2 Alaska 0 100.000000 Non-AI
80 South Dakota 0 25.000000 Non-AI
52 Nebraska 0 25.000000 Non-AI
60 New Mexico 0 11.111111 Non-AI
64 North Carolina 0 2.777778 Non-AI
46 Mississippi 0 0.000000 Non-AI
def label_ai_impact(naics, ai_codes=None, non_ai_codes=None):
    """Classify a 2-digit NAICS sector code as AI-impacted or not.

    Parameters
    ----------
    naics : scalar
        Sector code; may be an int, float (e.g. 44.0), numeric string, or NaN.
    ai_codes, non_ai_codes : list of int, optional
        Code lists to classify against; when omitted they default to the
        module-level ai_impacted_naics / non_ai_impacted_naics lists.

    Returns
    -------
    str
        "AI-Impacted", "Non-AI-Impacted", or "Unclassified" when the code
        is missing, non-numeric, or in neither list.
    """
    if ai_codes is None:
        ai_codes = ai_impacted_naics
    if non_ai_codes is None:
        non_ai_codes = non_ai_impacted_naics
    try:
        code = int(naics)
    except (TypeError, ValueError):
        # NaN, None, and non-numeric strings land here. The original bare
        # `except:` also swallowed SystemExit/KeyboardInterrupt.
        return "Unclassified"
    if code in ai_codes:
        return "AI-Impacted"
    if code in non_ai_codes:
        return "Non-AI-Impacted"
    return "Unclassified"
df['AI_IMPACTED'] = df['NAICS_2022_2'].apply(label_ai_impact)

# Keep only rows that fell into one of the two classes.
df = df[df["AI_IMPACTED"].isin(["AI-Impacted", "Non-AI-Impacted"])]

# Split into the two cohorts, keeping just geography/industry/occupation.
_cohort_cols = [
    "STATE_NAME", "NAICS_2022_2", "NAICS_2022_2_NAME",
    "LOT_SPECIALIZED_OCCUPATION_NAME",
]
ai_df = df.loc[df["AI_IMPACTED"] == "AI-Impacted", _cohort_cols]
non_ai_df = df.loc[df["AI_IMPACTED"] == "Non-AI-Impacted", _cohort_cols]
from IPython.display import display

# Show both cohorts side by side in the notebook output.
print("AI-Impacted Industries:")
display(ai_df)

print("\nNon-AI-Impacted Industries:")
display(non_ai_df)
AI-Impacted Industries:
STATE_NAME NAICS_2022_2 NAICS_2022_2_NAME LOT_SPECIALIZED_OCCUPATION_NAME
0 Arkansas 44.0 Retail Trade General ERP Analyst / Consultant
1 Maine 56.0 Administrative and Support and Waste Managemen... Oracle Consultant / Analyst
2 Texas 52.0 Finance and Insurance Data Analyst
3 Arizona 52.0 Finance and Insurance Data Analyst
5 Arkansas 51.0 Information Data Analyst
... ... ... ... ...
72493 Virginia 54.0 Professional, Scientific, and Technical Services Data Analyst
72494 Massachusetts 51.0 Information Enterprise Architect
72495 Michigan 56.0 Administrative and Support and Waste Managemen... Data Analyst
72496 Maine 51.0 Information Data Analyst
72497 Texas 54.0 Professional, Scientific, and Technical Services Oracle Consultant / Analyst

52444 rows × 4 columns


Non-AI-Impacted Industries:
STATE_NAME NAICS_2022_2 NAICS_2022_2_NAME LOT_SPECIALIZED_OCCUPATION_NAME
15 Massachusetts 61.0 Educational Services Data Analyst
18 Alabama 62.0 Health Care and Social Assistance Enterprise Architect
37 Virginia 62.0 Health Care and Social Assistance Data Analyst
77 Ohio 23.0 Construction Enterprise Architect
94 Texas 61.0 Educational Services Business Analyst (General)
... ... ... ... ...
72451 South Carolina 62.0 Health Care and Social Assistance Data Analyst
72460 California 11.0 Agriculture, Forestry, Fishing and Hunting Data Analyst
72472 California 61.0 Educational Services General ERP Analyst / Consultant
72479 Wyoming 61.0 Educational Services Data Analyst
72489 Texas 22.0 Utilities SAP Analyst / Admin

7942 rows × 4 columns

# Step 1: Count total jobs by state
total_jobs_by_state = df.groupby('STATE_NAME').size().rename('Total_Jobs')
# Step 2: Count AI-impacted jobs by state
ai_jobs_by_state = (
    df[df['AI_IMPACTED'] == 'AI-Impacted']
    .groupby('STATE_NAME').size().rename('AI_Impacted_Jobs')
)
# Step 3: side-by-side summary; states with no AI postings get 0, not NaN.
ai_impact_summary = pd.concat([total_jobs_by_state, ai_jobs_by_state], axis=1).fillna(0)

# Step 4: drop unclassified industries, then parse the date columns.
# .copy() detaches the filtered frame so the datetime assignments below
# are not made on a view (avoids SettingWithCopyWarning).
df = df[df['NAICS_2022_2_NAME'] != 'Unclassified Industry'].copy()
df['POSTED'] = pd.to_datetime(df['POSTED'], errors='coerce')
df['EXPIRED'] = pd.to_datetime(df['EXPIRED'], errors='coerce')

# Rows without both dates cannot contribute to timeline analysis.
df = df.dropna(subset=['POSTED', 'EXPIRED'])
import plotly.express as px

# Postings per industry, sorted least-to-most so the longest bar renders on top.
industry_counts = df['NAICS2_NAME'].value_counts().reset_index()
industry_counts.columns = ['NAICS2_NAME', 'count']
industry_counts = industry_counts.sort_values(by='count', ascending=True)

# Horizontal bars shaded by posting volume.
fig = px.bar(
    industry_counts,
    x='count',
    y='NAICS2_NAME',
    orientation='h',
    title='Job Postings by Industry',
    labels={'NAICS2_NAME': 'Industry', 'count': 'Number of Postings'},
    color='count',
    color_continuous_scale='Blues',
)

# White background, vertical gridlines only.
fig.update_layout(
    yaxis_title='Industry',
    xaxis_title='Number of Postings',
    title_font_size=20,
    plot_bgcolor='white',
    xaxis=dict(showgrid=True),
    yaxis=dict(showgrid=False),
)

fig.show()
import plotly.express as px

# AI job growth plot. top_ai / top_non_ai (built earlier) hold the growth
# figure in the PCT_CHANGE column — the original referenced nonexistent
# Avg_AI_Job_Growth / Avg_Non_AI_Job_Growth columns, which raises KeyError.
fig_ai = px.bar(
    top_ai,
    x='STATE_NAME',
    y='PCT_CHANGE',
    title='Top 10 States by Average AI Job Growth',
    labels={'STATE_NAME': 'State', 'PCT_CHANGE': 'Avg % AI Job Growth'},
    text='PCT_CHANGE'
)
fig_ai.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
fig_ai.update_layout(yaxis_title='Average % Growth', xaxis_title='State')
fig_ai.show()

# Non-AI job growth plot
fig_non_ai = px.bar(
    top_non_ai,
    x='STATE_NAME',
    y='PCT_CHANGE',
    title='Top 10 States by Average Non-AI Job Growth',
    labels={'STATE_NAME': 'State', 'PCT_CHANGE': 'Avg % Non-AI Job Growth'},
    text='PCT_CHANGE'
)
fig_non_ai.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
fig_non_ai.update_layout(yaxis_title='Average % Growth', xaxis_title='State')
fig_non_ai.show()
# Midpoint of the advertised salary range (row-wise mean skips NaN bounds).
df['AVERAGE_SALARY'] = df[['SALARY_FROM', 'SALARY_TO']].mean(axis=1)

# Mean salary per state, split by AI impact class.
avg_salary_by_state_type = (
    df.groupby(['STATE_NAME', 'AI_IMPACTED'], as_index=False)['AVERAGE_SALARY'].mean()
)
import numpy as np

# Step 1: ±inf comes from dividing by a zero base-month count; treat as missing.
pivoted = pivoted.replace([np.inf, -np.inf], np.nan)

# Step 2: rows without both growth figures cannot be ranked.
pivoted = pivoted.dropna(subset=['AI_Growth_Pct', 'Non_AI_Growth_Pct'])

# Step 3: average percent growth per state.
avg_growth = (
    pivoted.groupby('STATE_NAME')[['AI_Growth_Pct', 'Non_AI_Growth_Pct']]
    .mean()
    .reset_index()
)

# Step 4: rank states, AI growth first, Non-AI as tiebreaker.
ranked = avg_growth.sort_values(by=['AI_Growth_Pct', 'Non_AI_Growth_Pct'], ascending=False)

# Step 5: show the two leaderboards.
from IPython.display import display
print("Top states by AI job growth:")
display(ranked[['STATE_NAME', 'AI_Growth_Pct']].head(10))

print("\nTop states by Non-AI job growth:")
display(ranked[['STATE_NAME', 'Non_AI_Growth_Pct']].sort_values(by='Non_AI_Growth_Pct', ascending=False).head(10))
Top states by AI job growth:
AI_IMPACTED STATE_NAME AI_Growth_Pct
50 Wyoming 83.333333
25 Montana 42.918193
48 West Virginia 31.637054
40 South Dakota 11.742424
44 Vermont 1.800478
30 New Mexico -0.147900
36 Oregon -0.722496
16 Kentucky -1.049939
20 Massachusetts -1.816226
43 Utah -2.449036

Top states by Non-AI job growth:
AI_IMPACTED STATE_NAME Non_AI_Growth_Pct
33 North Dakota 533.333333
40 South Dakota 220.833333
38 Rhode Island 94.166667
25 Montana 77.777778
50 Wyoming 75.000000
3 Arkansas 65.789474
1 Alaska 53.968254
48 West Virginia 53.333333
11 Idaho 47.070707
30 New Mexico 43.452381
import seaborn as sns
import matplotlib.pyplot as plt

# Step 9: Visualize average monthly growth for the top 10 states.
# avg_growth (built above) holds the figures in AI_Growth_Pct /
# Non_AI_Growth_Pct — the original indexed nonexistent AI_Growth /
# Non_AI_Growth columns and raised KeyError. Passing figsize to
# DataFrame.plot also removes the stray empty figure the original's
# separate plt.figure() call left behind.
ax = (
    avg_growth
    .set_index('STATE_NAME')[['AI_Growth_Pct', 'Non_AI_Growth_Pct']]
    .head(10)
    .plot(kind='bar', figsize=(14, 6))
)
ax.set_title('Average Monthly Job Growth (Top 10 States) for AI vs. Non-AI Careers')
ax.set_ylabel('Average Monthly Job Growth')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[80], line 6
      4 # Step 9: Visualize
      5 plt.figure(figsize=(14, 6))
----> 6 avg_growth[['AI_Growth', 'Non_AI_Growth']].head(10).plot(kind='bar')
      7 plt.title('Average Monthly Job Growth (Top 10 States) for AI vs. Non-AI Careers')
      8 plt.ylabel('Average Monthly Job Growth')

File ~/metad688-job-market-analysis-for-summer-2025-group2/.venv/lib/python3.12/site-packages/pandas/core/frame.py:4108, in DataFrame.__getitem__(self, key)
   4106     if is_iterator(key):
   4107         key = list(key)
-> 4108     indexer = self.columns._get_indexer_strict(key, "columns")[1]
   4110 # take() does not accept boolean indexers
   4111 if getattr(indexer, "dtype", None) == bool:

File ~/metad688-job-market-analysis-for-summer-2025-group2/.venv/lib/python3.12/site-packages/pandas/core/indexes/base.py:6200, in Index._get_indexer_strict(self, key, axis_name)
   6197 else:
   6198     keyarr, indexer, new_indexer = self._reindex_non_unique(keyarr)
-> 6200 self._raise_if_missing(keyarr, indexer, axis_name)
   6202 keyarr = self.take(indexer)
   6203 if isinstance(key, Index):
   6204     # GH 42790 - Preserve name from an Index

File ~/metad688-job-market-analysis-for-summer-2025-group2/.venv/lib/python3.12/site-packages/pandas/core/indexes/base.py:6249, in Index._raise_if_missing(self, key, indexer, axis_name)
   6247 if nmissing:
   6248     if nmissing == len(indexer):
-> 6249         raise KeyError(f"None of [{key}] are in the [{axis_name}]")
   6251     not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())
   6252     raise KeyError(f"{not_found} not in index")

KeyError: "None of [Index(['AI_Growth', 'Non_AI_Growth'], dtype='object', name='AI_IMPACTED')] are in the [columns]"
<Figure size 1400x600 with 0 Axes>
# Restrict to five large states of interest (the original comment about
# "Boston, MA and Austin, TX" did not match the code).
selected_state = ['California', 'Florida', 'Massachusetts', 'Texas', 'New York']
filtered_df = df[df['STATE_NAME'].isin(selected_state)]

# Keep analyst-type occupations (case-insensitive substring; NaN -> False),
# then preview the geography/industry columns.
analyst_mask = filtered_df['LOT_SPECIALIZED_OCCUPATION_NAME'].str.contains('analyst', case=False, na=False)
final_df = filtered_df[analyst_mask]
final_df[['STATE_NAME', 'NAICS2_NAME', 'NAICS_2022_6', 'LOT_SPECIALIZED_OCCUPATION_NAME']].head(100)
STATE_NAME NAICS2_NAME NAICS_2022_6 LOT_SPECIALIZED_OCCUPATION_NAME
2 Texas Finance and Insurance 524291.0 Data Analyst
4 California Unclassified Industry 999999.0 Oracle Consultant / Analyst
9 New York Professional, Scientific, and Technical Services 541511.0 Data Analyst
10 California Wholesale Trade 423830.0 Data Analyst
15 Massachusetts Educational Services 611310.0 Data Analyst
... ... ... ... ...
294 Florida Educational Services 611310.0 SAP Analyst / Admin
295 California Finance and Insurance 524114.0 Data Analyst
296 New York Unclassified Industry 999999.0 General ERP Analyst / Consultant
297 Texas Professional, Scientific, and Technical Services 541611.0 SAP Analyst / Admin
299 Texas Professional, Scientific, and Technical Services 541511.0 General ERP Analyst / Consultant

100 rows × 4 columns

# NOTE(review): this cell repeats the state/analyst filtering cell above
# verbatim; consider deleting one copy.
selected_state = ['California', 'Florida', 'Massachusetts', 'Texas', 'New York']
filtered_df = df.loc[df['STATE_NAME'].isin(selected_state)]
final_df = filtered_df.loc[
    filtered_df['LOT_SPECIALIZED_OCCUPATION_NAME'].str.contains('analyst', case=False, na=False)
]
final_df[['STATE_NAME', 'NAICS2_NAME', 'NAICS_2022_6', 'LOT_SPECIALIZED_OCCUPATION_NAME']].head(100)
STATE_NAME NAICS2_NAME NAICS_2022_6 LOT_SPECIALIZED_OCCUPATION_NAME
2 Texas Finance and Insurance 524291.0 Data Analyst
4 California Unclassified Industry 999999.0 Oracle Consultant / Analyst
9 New York Professional, Scientific, and Technical Services 541511.0 Data Analyst
10 California Wholesale Trade 423830.0 Data Analyst
15 Massachusetts Educational Services 611310.0 Data Analyst
... ... ... ... ...
294 Florida Educational Services 611310.0 SAP Analyst / Admin
295 California Finance and Insurance 524114.0 Data Analyst
296 New York Unclassified Industry 999999.0 General ERP Analyst / Consultant
297 Texas Professional, Scientific, and Technical Services 541611.0 SAP Analyst / Admin
299 Texas Professional, Scientific, and Technical Services 541511.0 General ERP Analyst / Consultant

100 rows × 4 columns

import matplotlib.pyplot as plt
import numpy as np

# Number of analyst postings per state, largest first.
state_counts_jobs = final_df.groupby('STATE_NAME').size().reset_index(name='job_count')
state_counts_jobs_sorted = state_counts_jobs.sort_values(by='job_count', ascending=False)

# Column chart with one coolwarm shade per state.
bar_colors = plt.cm.coolwarm(np.linspace(0, 1, len(state_counts_jobs_sorted)))
plt.figure(figsize=(8, 5))
plt.bar(state_counts_jobs_sorted['STATE_NAME'],
        state_counts_jobs_sorted['job_count'],
        color=bar_colors)
plt.xlabel('State')
plt.ylabel('Number of Jobs')
plt.title('Tech Jobs by State (Job Title Contains "Analyst")')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# Reload the raw extract for the modelling section. low_memory=False reads
# each column in one pass, fixing the "Columns (19,30) have mixed types"
# DtypeWarning this cell emitted.
file_path = "./data/lightcast_job_postings.csv"
df = pd.read_csv(file_path, low_memory=False)
/tmp/ipykernel_2724/1191241380.py:3: DtypeWarning:

Columns (19,30) have mixed types. Specify dtype option on import or set low_memory=False.
# Distribution of the raw remote-type labels, including missing values.
remote_counts = df['REMOTE_TYPE_NAME'].value_counts(dropna=False)
print(remote_counts.head(10))
REMOTE_TYPE_NAME
[None]           56570
Remote           12497
Hybrid Remote     2260
Not Remote        1127
NaN                 44
Name: count, dtype: int64
# Step 1: normalise the remote-type labels. astype(str) turns NaN into
# "nan", which .str.title() renders as "Nan"; both that and literal "None"
# are mapped back to missing so step 2 can fill them uniformly.
df['REMOTE_TYPE_NAME'] = (
    df['REMOTE_TYPE_NAME']
    .astype(str)
    .str.strip()
    .str.title()
    .replace({'None': pd.NA, 'Nan': pd.NA})
)
# Step 2: missing values and the literal "[None]" marker both mean the
# posting is not remote.
df['REMOTE_TYPE_NAME'] = (
    df['REMOTE_TYPE_NAME'].fillna('Not Remote').replace('[None]', 'Not Remote')
)
print(df['REMOTE_TYPE_NAME'].value_counts(dropna=False).head(10))
REMOTE_TYPE_NAME
Not Remote       57741
Remote           12497
Hybrid Remote     2260
Name: count, dtype: int64
# Convert all values to strings and strip whitespace
df['REMOTE_TYPE_NAME'] = df['REMOTE_TYPE_NAME'].astype(str).str.strip()

# Binary target: 1 for any remote arrangement, 0 otherwise. isin() is the
# vectorised equivalent of the original row-wise apply/lambda and yields
# the same 0/1 integers after the cast.
df['REMOTE_BINARY'] = df['REMOTE_TYPE_NAME'].isin(['Remote', 'Hybrid Remote']).astype(int)

# Check result
print(df['REMOTE_TYPE_NAME'].value_counts())
print("\nBinary classification:")
print(df['REMOTE_BINARY'].value_counts())
REMOTE_TYPE_NAME
Not Remote       57741
Remote           12497
Hybrid Remote     2260
Name: count, dtype: int64

Binary classification:
REMOTE_BINARY
0    57741
1    14757
Name: count, dtype: int64
import pandas as pd
import numpy as np

# Coerce salary bounds to numbers; a 0 bound is treated as "not reported".
for _col in ('SALARY_FROM', 'SALARY_TO'):
    df[_col] = pd.to_numeric(df[_col], errors='coerce').replace(0, np.nan)

# Midpoint of the advertised range (row-wise mean skips NaN bounds).
df['AVERAGE_SALARY'] = df[['SALARY_FROM', 'SALARY_TO']].mean(axis=1)

# Rows usable for the state/remote salary summary.
df_salary = df.dropna(subset=['AVERAGE_SALARY', 'REMOTE_TYPE_NAME', 'STATE_NAME'])

# Mean salary per (state, remote type), rounded for display.
avg_salary_by_state_remote = (
    df_salary
    .groupby(['STATE_NAME', 'REMOTE_TYPE_NAME'])['AVERAGE_SALARY']
    .mean()
    .reset_index()
)
avg_salary_by_state_remote['AVERAGE_SALARY'] = avg_salary_by_state_remote['AVERAGE_SALARY'].round(2)

print(avg_salary_by_state_remote)
    STATE_NAME REMOTE_TYPE_NAME  AVERAGE_SALARY
0      Alabama    Hybrid Remote        95779.32
1      Alabama       Not Remote       110741.22
2      Alabama           Remote       112252.79
3       Alaska    Hybrid Remote        85384.29
4       Alaska       Not Remote        91767.29
..         ...              ...             ...
147  Wisconsin       Not Remote       114918.55
148  Wisconsin           Remote       104307.56
149    Wyoming    Hybrid Remote       221727.00
150    Wyoming       Not Remote       106695.18
151    Wyoming           Remote       111982.65

[152 rows x 3 columns]
# Attach the state/remote-type average salary to each posting. The
# original merged the lookup table twice, producing duplicate
# AVERAGE_SALARY_x / AVERAGE_SALARY_y columns that then needed renaming;
# a single merge of a pre-renamed lookup yields
# AVERAGE_SALARY_STATE_REMOTE_AVG directly (the column the model uses).
df = df.merge(
    avg_salary_by_state_remote.rename(
        columns={'AVERAGE_SALARY': 'AVERAGE_SALARY_STATE_REMOTE_AVG'}
    ),
    on=['STATE_NAME', 'REMOTE_TYPE_NAME'],
    how='left',
)

# Sanity check: list the salary-related columns now present.
[col for col in df.columns if 'AVG' in col or 'SALARY' in col]
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, f1_score, confusion_matrix,
                             classification_report, precision_score,
                             recall_score, balanced_accuracy_score)
from sklearn.inspection import permutation_importance
import seaborn as sns
import matplotlib.pyplot as plt
# Keep only the first occurrence of each column name across the dataframe.
df = df.loc[:, ~df.columns.duplicated()]

# Model inputs: numeric experience/salary features plus categorical context.
num_feats = [
    'AVG_YEARS_EXPERIENCE',
    'EXP_SPREAD',
    'AVERAGE_SALARY_STATE_REMOTE_AVG',
]

cat_feats = [
    'NAICS_2022_2_NAME',
    'STATE_NAME',
    'TITLE_CLEAN',
]

X = df[num_feats + cat_feats]
y = df['REMOTE_BINARY']
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Scale numeric features; one-hot the categoricals. handle_unknown='ignore'
# encodes unseen categories at predict time as all-zeros instead of raising.
preprocess = ColumnTransformer(transformers=[
    ("num", StandardScaler(), num_feats),
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_feats),
])

# Stratified 80/20 split keeps the remote/on-site ratio in both sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Baseline estimator kept from the original cell (not used below).
clf = RandomForestClassifier(random_state=42, class_weight='balanced')

# Final model: 200 unpruned trees, class-balanced weights, all cores.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)

# Preprocessing + model as a single fit/predict object.
pipe = Pipeline(steps=[
    ('prep', preprocess),
    ('model', rf),
])
pipe.fit(X_train, y_train)
Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['AVG_YEARS_EXPERIENCE',
                                                   'EXP_SPREAD',
                                                   'AVERAGE_SALARY_STATE_REMOTE_AVG']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse_output=False),
                                                  ['NAICS_2022_2_NAME',
                                                   'STATE_NAME',
                                                   'TITLE_CLEAN'])])),
                ('model',
                 RandomForestClassifier(class_weight='balanced',
                                        n_estimators=200, n_jobs=-1,
                                        random_state=42))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
from sklearn.metrics import (
    classification_report, accuracy_score, f1_score,
    precision_score, recall_score, balanced_accuracy_score,
    confusion_matrix
)
import seaborn as sns
import matplotlib.pyplot as plt

# Hold-out predictions and the usual binary-classification metrics.
y_pred = pipe.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
recall_1 = recall_score(y_test, y_pred, pos_label=1)  # sensitivity: remote recall
recall_0 = recall_score(y_test, y_pred, pos_label=0)  # specificity: on-site recall
balanced_acc = balanced_accuracy_score(y_test, y_pred)

print("Classification report:\n", classification_report(y_test, y_pred))
print(f"Accuracy:           {accuracy:.3f}")
print(f"F1 Score:           {f1:.3f}")
print(f"Precision:          {precision:.3f}")
print(f"Sensitivity (1):    {recall_1:.3f}")
print(f"Specificity (0):    {recall_0:.3f}")
print(f"Balanced Accuracy:  {balanced_acc:.3f}")
print("\nConfusion Matrix:")

# Annotated heatmap of the confusion matrix.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Pred 0', 'Pred 1'],
            yticklabels=['True 0', 'True 1'])
plt.show()
Classification report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     11549
           1       1.00      0.99      0.99      2951

    accuracy                           1.00     14500
   macro avg       1.00      0.99      0.99     14500
weighted avg       1.00      1.00      1.00     14500

Accuracy:           0.997
F1 Score:           0.992
Precision:          0.999
Sensitivity (1):    0.985
Specificity (0):    1.000
Balanced Accuracy:  0.993

Confusion Matrix:

# Summarise the experience range into a midpoint and a spread, then drop
# the raw bounds.
exp_min = df['MIN_YEARS_EXPERIENCE']
exp_max = df['MAX_YEARS_EXPERIENCE']
df['AVG_YEARS_EXPERIENCE'] = (exp_min + exp_max) / 2
df['EXP_SPREAD'] = exp_max - exp_min
df = df.drop(columns=['MIN_YEARS_EXPERIENCE', 'MAX_YEARS_EXPERIENCE'])
from sklearn.inspection import permutation_importance
import pandas as pd

# Fit the full pipeline, then rank each raw input column by how much
# shuffling it degrades the test-set score (permutation importance on
# the pipeline, so preprocessing is re-applied to the permuted data).
pipe.fit(X_train, y_train)

perm_result = permutation_importance(
    pipe, X_test, y_test, n_repeats=30, random_state=42, n_jobs=-1
)

# One row per input feature, sorted by mean importance across repeats.
importance_event_visitor = (
    pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': perm_result.importances_mean,
    })
    .sort_values(by='Importance', ascending=False)
)

print("\nFeature Importance Ranking:")
print(importance_event_visitor.head(10))

Feature Importance Ranking:
                           Feature  Importance
2  AVERAGE_SALARY_STATE_REMOTE_AVG    0.426577
4                       STATE_NAME    0.346085
5                      TITLE_CLEAN    0.005887
3                NAICS_2022_2_NAME    0.001828
0             AVG_YEARS_EXPERIENCE    0.000025
1                       EXP_SPREAD    0.000000
import pandas as pd
import plotly.express as px

# Full state name -> USPS two-letter code for the 50 states plus DC.
# Plotly's "USA-states" location mode keys on these abbreviations.
us_state_abbrev = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ',
    'Arkansas': 'AR', 'California': 'CA', 'Colorado': 'CO',
    'Connecticut': 'CT', 'Delaware': 'DE', 'Florida': 'FL',
    'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID',
    'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA',
    'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA',
    'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA',
    'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS',
    'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE',
    'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ',
    'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC',
    'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK',
    'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI',
    'South Carolina': 'SC', 'South Dakota': 'SD', 'Tennessee': 'TN',
    'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT',
    'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV',
    'Wisconsin': 'WI', 'Wyoming': 'WY', 'District of Columbia': 'DC'
}

# Translate names to codes; names absent from the lookup become NaN and
# are excluded by the groupby below (pandas drops NaN group keys).
df['STATE_ABBR'] = df['STATE_NAME'].map(us_state_abbrev)

# Per-state aggregates: share of remote postings, mean salary and
# experience, and a posting count.
choropleth_data = (
    df.groupby('STATE_ABBR')
      .agg(
          remote_ratio=('REMOTE_BINARY', 'mean'),
          avg_salary=('AVERAGE_SALARY_STATE_REMOTE_AVG', 'mean'),
          avg_experience=('AVG_YEARS_EXPERIENCE', 'mean'),
          job_count=('STATE_NAME', 'count'),
      )
      .reset_index()
)

# Color states by remote-job ratio; surface the other aggregates on hover.
fig = px.choropleth(
    data_frame=choropleth_data,
    locations='STATE_ABBR',
    locationmode="USA-states",
    color='remote_ratio',
    color_continuous_scale="Blues",
    scope="usa",
    labels={'remote_ratio': 'Remote Job Ratio'},
    hover_data={
        'remote_ratio': ':.2f',
        'avg_salary': ':.0f',
        'avg_experience': ':.1f',
        'job_count': True,
    },
    title='Remote Job Ratio by State (Color), Avg Salary & Experience in Hover',
)

fig.update_layout(margin={"r": 0, "t": 40, "l": 0, "b": 0})
fig.show()